We compare the genomic offset predictions from the four following methods:
Generalized dissimilarity modelling (GDM). See report 9_GeneralizedDissimilarityModelling_GenomicOffsetPredictions.html.
Latent factor mixed model (LFMM) . See report 7_LFMM_IdentificationCandidateSNPs_GenomicOffsetPredictions.html.
Gradient Forest (GF). See report 10_GradientForest_GenomicOffsetPredictions.html.
Redundancy analysis (RDA) and partial RDA (pRDA). See report 11_RedundancyAnalysis_GenomicOffsetPredictions.html.
Code
# Method namesmeth_names <-c("GDM","LFMM","GF","RDA", "pRDA")# population namespop_names <-readRDS(here("data/GenomicData/MainGenePoolPopulations.rds")) %>%arrange(pop) %>%pull(pop)# SNP setssnp_sets <-readRDS(here("outputs/list_snp_sets.rds"))
Comparing population ranks
In this section, we compare GO predictions from the different methods (GF, GDM, RDA and LFMM), SNP sets (candidates, control and all SNPs) and GCMs. For that, for each combination of method/SNP set/GCM, we ranked the populations based on the GO predictions, i.e. populations with the lowest rank had the highest GO predictions.
Across methods/SNP sets
We first compared the population ranks from the different combinations method/SNP set. For that, we generated one plot per GCM. We also generated a plot in which the population ranks are not based on an unique GCM but are based on the average GO predictions across all GCMs. In the plots below, we colored the populations that were among the three populations with the highest genomic offset in at least one combination method/SNP set.
Code
################################################################## Ranking the populations for each combination method/SNP set/GCM################################################################## Extracting the genomic offset predictions from LFMM, GFM and GF# ===============================================================sub_meth_names <-c("LFMM", "GF", "GDM")go_df <-lapply(sub_meth_names, function(meth_name) { go_predictions <-readRDS(file=here(paste0("outputs/",meth_name,"/go_predictions.rds"))) go_predictions %>%lapply(function(snp_set) { snp_set$go %>%lapply(function(gcm){tibble(pop = pop_names,go = gcm) %>%arrange(go) %>%mutate(rank =rev(1:length(pop_names))) }) %>%bind_rows(.id="gcm") }) %>%bind_rows(.id="snp_set")}) %>%setNames(sub_meth_names) %>%bind_rows(.id="method")# Extracting the genomic offset predictions from the RDA and pRDA# ===============================================================# Which method do we use for the RDA? RDA_method <-"predict"# "loadings" or "predict"# loading the genomic offset predictions from the RDArda_pred <-readRDS(file=here(paste0("outputs/RDA/go_predictions_",RDA_method,".rds")))# pRDA (RDA corrected for population structure)prda_pred <-lapply(names(snp_sets), function(set_i){ rda_pred[[set_i]]$go_corrected %>%lapply(function(gcm){ tibble(pop = pop_names,go = gcm) %>%arrange(go) %>%mutate(rank =rev(1:length(pop_names)),snp_set = set_i,method ="pRDA") }) %>%bind_rows(.id="gcm") }) %>%bind_rows()# RDA (RDA not corrected for population structure)go_df <-lapply(names(snp_sets), function(set_i){ rda_pred[[set_i]]$go_uncorrected %>%lapply(function(gcm){ tibble(pop = pop_names,go = gcm) %>%arrange(go) %>%mutate(rank =rev(1:length(pop_names)),snp_set = set_i,method ="RDA") }) %>%bind_rows(.id="gcm") }) %>%bind_rows() %>%bind_rows(prda_pred) %>%bind_rows(go_df)# ============================================================================================================# ============================================================================================================# For each combination of method (GF, GDM, LFMM, pRDA and RDA) and SNP set, # we calculate the average of the GO predictions across the five GCMs, # and then we rank the populations based on their GO predicted value.# First option (that we should use I think): take the mean of the GO predictionsgo_df <- go_df %>%group_by(pop,method,snp_set) %>%summarise(go=mean(go)) %>%ungroup() %>%group_split(method,snp_set) %>%lapply(function(x){ x %>%arrange(go) %>%mutate(rank =rev(1:length(pop_names)),gcm ="GCMs_average") }) %>%bind_rows() %>%bind_rows(go_df)# Another option (the first I use, but I think we should use the option above): take the mean of the ranks# go_df %>%# group_by(pop,method,snp_set) %>%# summarise(mean_rank=mean(rank),go=mean(go)) %>%# ungroup() %>%# group_split(method,snp_set) %>%# lapply(function(x){# x %>%# arrange(mean_rank) %>%# mutate(rank = 1:length(pop_names),# gcm = "GCM average") %>%# dplyr::select(-mean_rank)# }) %>%# bind_rows() %>%# bind_rows(go_df)# =====================================# =====================================# FIGURE TITLE AND LABELS# We create a vector that we will use for the x-axis labels and the plot titlego_df <-lapply(snp_sets, function(set_i){lapply(meth_names, function(meth_i){tibble(method_snpset_names =paste0(meth_i, " - ", set_i$set_name),meth_snpset =paste0(meth_i, "_", set_i$set_code),snpset_names = set_i$set_name,method = meth_i,snp_set = set_i$set_code) }) }) %>% bind_rows %>%left_join(go_df, by =c("method", "snp_set"))# save for the shiny appgo_df %>%mutate(gcm =if_else(gcm =="GCMs_average", "Average across GCMs", gcm)) %>%saveRDS(here("shiny/PredictionVariability/go_df.rds"))
Code
# Which method do we show on the graph? selected_meth <- meth_names# Which SNP set do we show? selected_sets <-names(snp_sets)[c(1,6)]# Subseting the dataset to keep only the method and SNP sets that we want for the figuressub_df <- go_df %>%filter(snp_set %in% selected_sets, method %in% selected_meth)
Code
# =================================# Code the generate the bump charts# =================================# the colors I will use to color the populations with high GOmy_colors <-c(brewer.pal(n=12, "Paired"),"#FF40EE","cyan","yellow")bump_charts <- sub_df %>%#filter(! (snp_set == "cand_corrected")) %>% # mutate(x_axis = method_snpset_names[paste0(method, "_", snp_set)]) %>%group_split(gcm) %>%lapply(function(x){high_go_pops <- x %>%filter(rank <4) %>%pull(pop) %>%unique() my_palette <-c(my_colors[1:length(high_go_pops)],"#E8E8E8")if(unique(x$gcm)=="GCMs_average"){ plot_title <-"Average across the five GCMs"} else { plot_title <-unique(x$gcm)}sub <- x %>%mutate(flag =ifelse(pop %in% high_go_pops, TRUE, FALSE),pop_col =if_else(flag ==TRUE, pop, "Others")) %>%mutate(pop =factor(pop, levels=c(setdiff(unique(x$pop),high_go_pops),high_go_pops)),pop_col =factor(pop_col, levels =c(high_go_pops,"Others")))p <- sub %>%ggplot(aes(x = method_snpset_names, y = rank, group = pop)) +geom_point(aes(color = pop_col), size =2, alpha =0.9) +geom_line(aes(color = pop_col), linewidth =2, alpha =0.8) +scale_y_reverse(breaks =1:nrow(sub)) +scale_color_manual(values = my_palette) +geom_text(data = sub %>%filter(method_snpset_names ==last(levels(factor(sub$method_snpset_names))) & pop %in% high_go_pops),aes(label = pop, x =length(unique(sub$method_snpset_names)) +0.1) , hjust =0.15, color ="gray20", size =3) +#color = "#888888",theme_bw() +ylab("Population rank") +ggtitle(plot_title) +theme(panel.grid.minor.y =element_blank(),panel.grid.major.y =element_blank(),panel.grid.major.x =element_blank(),legend.position ="none",axis.text.y =element_text(size=10),axis.text.x =element_text(size=11, angle =60, hjust =1),axis.title.y =element_text(size=16),axis.title.x =element_blank())ggsave(p, filename =here(paste0("figs/PredictionVariability/BumpChart_PopRank_",unique(x$gcm),".pdf")), device ="pdf", width=11, height =7)if(unique(x$gcm)=="GCMs_average"){ p_manuscript <- p +ggtitle("") ggsave(p_manuscript, filename =here(paste0("figs/PredictionVariability/BumpChart_PopRank_",unique(x$gcm),"_notitle.pdf")), device ="pdf", width=13)} p })pdf(here("figs/PredictionVariability/BumpCharts_PerMethSnpSet.pdf"), width=11, height =7)lapply(bump_charts, function(x) x)dev.off()bump_charts
Across GCMs
We then compared the population ranks from the different GCMs (or from the average GO predictions across the five GCMs). For that, we generated one plot per combination method/SNP set. We also generated a plot in which the population ranks are not based on an unique combination method/SNP set but are based on the average GO predictions across all combinations method/SNP set.
In the plots below, we colored the populations that were among the three populations with the highest genomic offset in at least one GCM.
`summarise()` has grouped output by 'pop'. You can override using the `.groups` argument.
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2 3.5.0.
ℹ Please use the `legend.position.inside` argument of `theme()` instead.
Code
p %>%ggsave(here(paste0("figs/GOmeanPredictions_AllCandidateSNPs_AcrossGOMethods.pdf")),., width=6,height=6, device="pdf")p %>%ggsave(here(paste0("figs/GF/GOmeanProjections_AllCandidateSNPs_AcrossGOMethods.png")),., width=6,height=6)p
Climatic distance
We build a dataset with both the genomic offset predictions and the climatic distances.
Code
# Set of select climatic variablesclim_var <-readRDS(here("data/ClimaticData/NamesSelectedVariables.rds"))# Loading the climatic datasets (which are mean-centered and standardized)source(here("scripts/functions/generate_scaled_clim_datasets.R"))clim_dfs <-generate_scaled_clim_datasets(clim_var)# We calculate the climatic distances for each GCMclimdist <-lapply(clim_dfs$clim_pred, function(x){# Calculate absolute differences df <-abs(x[,clim_var] - clim_dfs$clim_ref[,clim_var]) %>%as_tibble() %>%bind_cols(x[c("pop","gcm")],.) %>%mutate(eucli =unique(sqrt(rowSums((clim_dfs$clim_ref[,clim_var] - x[,clim_var])^2))))}) %>%bind_rows()# We calcualte the climatic distances for the average across GCMsclimdist <- climdist %>%group_by(pop) %>%summarize(across(where(is.numeric), \(x) mean(x, na.rm =TRUE))) %>%mutate(gcm="GCMs_average") %>%bind_rows(climdist) %>%pivot_longer(cols =all_of(c(clim_var,"eucli")), names_to ="var_code", values_to ="val") %>%mutate(var_name =case_when(var_code =="bio1"~"Mean annual temperature (bio1, °C)", var_code =="bio12"~"Annual precipitation (bio12, mm)", var_code =="bio15"~"Precipitation seasonality (bio15, index)", var_code =="bio3"~"Isothermality (bio3, index)", var_code =="bio4"~"Temperature seasonality (bio4, °C)", var_code =="SHM"~"Summer heat moisture index (SHM, °C/mm)", var_code =="eucli"~"Euclidean climatic distance"),method_name ="Climatic distance")# We merge the climatic distances with the genomic offset predictionsgo_climdist_df <- go_df %>% dplyr::rename(method_name = method,var_code = snp_set,var_name = snpset_names,val = go) %>% dplyr::select(-method_snpset_names, -meth_snpset, -rank) %>%bind_rows(climdist) %>%mutate(gcm =if_else(gcm =="GCMs_average", "Average across GCMs", gcm)) %>%left_join(readRDS(here("data/GenomicData/MainGenePoolInformation.rds"))[[1]], by="pop")# We export the dataset for the shinny appgo_climdist_df %>%saveRDS(here("shiny/PredictionVariability/go_climdist_df.rds"))